import numpy as np 
import pandas as pd 

# Load the dataset, then split it into the predictor table and the
# one-column response table. Both selections use a list of column names,
# so both results stay two-dimensional DataFrames.
data = pd.read_excel("data.xls")
predictor_names = ["数字化程度", "数字化程度（平方）", "平均工资"]
response_names = ["就业率"]
x_train = data[predictor_names]
y_train = data[response_names]

from sklearn.preprocessing import MinMaxScaler

# Feature scaling. MinMaxScaler rescales every column into its feature
# range (0..1 with the default settings); it does not center the data.
# The scaler is fitted on the full predictor table and then applied to
# that same table.
scaler = MinMaxScaler()
x_train_scaled = scaler.fit(x_train).transform(x_train)

def c_vector(x):
    """Return one feature column of ``x_train_scaled`` as a 1-D array.

    Args:
        x: 1-based column number; ``x - 1`` is used as the column index.

    Returns:
        A new ``numpy.ndarray`` holding the value of that column for every
        row of the module-level ``x_train_scaled`` matrix.
    """
    # Column slicing replaces the element-by-element append loop; wrapping
    # the slice in np.array() returns an independent copy instead of a view.
    return np.array(np.asarray(x_train_scaled)[:, x - 1])


import matplotlib.pyplot as plt

# Scatter plots: each scaled feature column against the logit of the
# response, log(p / (1 - p)), drawn side by side in one figure.
x_1, x_2, x_3 = (c_vector(column_no) for column_no in (1, 2, 3))
y = np.log(y_train / (1 - y_train))

fig, ax = plt.subplots(1, 3, figsize=(15, 4))

# One (feature values, panel title) pair per subplot, left to right.
panels = (
    (x_1, 'Degree of digitization'),
    (x_2, "Square term"),
    (x_3, "average wage"),
)
for axis, (feature_values, caption) in zip(ax, panels):
    axis.scatter(feature_values, y)
    axis.title.set_text(caption)
plt.show()

from sklearn import linear_model

# --- Model fitting ---------------------------------------------------------
# The regression is carried out on the logit scale, log(p / (1 - p)).
# The prediction below is mapped back with the inverse logit, so the model
# must be trained on the logit of the employment rate; fitting it on the raw
# rate would make that back-transformation meaningless.
y_logit = np.log(y_train / (1 - y_train))

model = linear_model.LinearRegression()
# Rows 0..8 form the training sample; the last row is kept for prediction.
model.fit(x_train_scaled[0:9], y_logit[0:9])
print(model.intercept_)
print(model.coef_)

# Predict the last row (the 2020 employment rate) on the logit scale, then
# convert the result back to a rate with the inverse logit.
y_pre = model.predict(x_train_scaled[-1].reshape(1, -1))
p_t = np.exp(y_pre) / (1 + np.exp(y_pre))

# In-sample fitted values and observed values, both on the logit scale and
# flattened to 1-D so the residual dot product is a scalar.
y_pre = model.predict(x_train_scaled[0:9]).ravel()
y_value = np.asarray(y_logit)[0:9].ravel()
residuals = y_pre - y_value
sse = np.dot(residuals, residuals)

# Residual standard error. The residual degrees of freedom are
# n - p - 1 (observations minus slopes minus the intercept), not n - 1.
# The variable keeps its historical name `mse` although it holds a root.
mse = np.sqrt(sse / (len(y_pre) - x_train_scaled.shape[1] - 1))
print(mse)

from scipy.stats import t

# --- Significance test of the slopes (H0: slope == 0) -----------------------
β = model.coef_[0]                  # slope estimates for the single response
n = 9                               # number of rows used to fit the model

# Residual degrees of freedom: observations minus slopes minus the intercept.
resid_df = n - len(β) - 1
x = t(resid_df)
sigma_hat_square = sse / resid_df

# Covariance of the estimates is sigma^2 * (X'X)^-1, where X is the design
# matrix actually used for fitting: the n training rows plus a leading
# column of ones for the intercept. The intercept row/column is dropped
# afterwards so that cov_β[i][i] lines up with β[i].
design = np.column_stack((np.ones(n), x_train_scaled[0:n]))
cov_β = np.linalg.inv(design.T @ design)[1:, 1:] * sigma_hat_square

# Two-sided p-values. The t statistic divides by the standard error (square
# root of the variance), and sf(|t|) is the upper-tail probability.
p_value = []
for slope, variance in zip(β, np.diag(cov_β)):
    t_value = (slope - 0) / np.sqrt(variance)
    p_value.append(2 * x.sf(abs(t_value)))
print(p_value)

# Compute SST, R-squared and adjusted R-squared.
y_bar = np.mean(y_value)
sst = np.sum((y_value - y_bar) ** 2)      # total sum of squares

R_square = (sst - sse) / sst

# Adjusted R-squared compares the two variance estimates:
#   1 - (SSE / (n - p - 1)) / (SST / (n - 1))
# with n observations and p predictors (one column of the scaled matrix each).
n_obs = len(y_value)
n_predictors = x_train_scaled.shape[1]
adjust_R_square = 1 - (sse / (n_obs - n_predictors - 1)) / (sst / (n_obs - 1))

